#!/usr/bin/env python3
# -*- coding: utf-8 -*-
__author__ = "HuJiang";

import requests;
from bs4 import BeautifulSoup;
import re;
from entity.Double_Color_Ball import *;
from datetime import datetime;

'''
双色球爬虫
'''

# Request headers sent with every page fetch; the User-Agent is a desktop
# Firefox browser string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) "
        "Gecko/20100101 Firefox/113.0"
    ),
}

# 获取所有数据，返回所有数据集合
def get_all_data(page_count=151):
    """Crawl the first ``page_count`` listing pages and return every record.

    The first page is served from ``list.html``; page ``n`` (n >= 2) is
    served from ``list_<n>.html``. Pages are fetched in ascending order via
    ``get_dcb_page_data``.

    Args:
        page_count: Number of listing pages to fetch, counted from page 1.
            Values below 1 fetch nothing.

    Returns:
        A single flat list containing the items returned by
        ``get_dcb_page_data`` for each page, in page order.
    """
    # Asking for zero (or fewer) pages must not trigger any request; the
    # previous code always fetched the first page regardless.
    if page_count < 1:
        return []

    url_list = ["http://kaijiang.zhcw.com/zhcw/html/ssq/list.html"]
    url_list.extend(
        "http://kaijiang.zhcw.com/zhcw/html/ssq/list_%d.html" % page
        for page in range(2, page_count + 1)
    )

    all_data = []
    for url in url_list:
        all_data.extend(get_dcb_page_data(url))

    return all_data

# 获取单页数据，返回每页数据list
def _first_group(pattern, text):
    """Return group 1 of the first match of ``pattern`` in ``text``, or ""."""
    found = re.search(pattern, text)
    return found.group(1) if found else ""


def get_dcb_page_data(url, timeout=10):
    """Download one listing page and parse its rows into records.

    Each ``<tr>`` holding more than one ``<td align="center">`` cell is
    treated as a result row and turned into a ``Double_Color_Ball``. Fields
    are pulled out of the row's serialized HTML with regular expressions; a
    numeric field that is missing or empty is stored as 0, and a missing
    blue ball as an empty string, instead of aborting the whole page.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout, so an
            unresponsive server cannot block the crawl indefinitely.

    Returns:
        A list of ``Double_Color_Ball`` objects in page order. The list is
        empty when the response status is not 200.

    Raises:
        Any exception raised by ``requests.get`` (connection errors,
        timeouts) propagates to the caller.
    """
    print(url)

    dcbs = []

    # Fetch the page.
    r = requests.get(url, headers=headers, timeout=timeout)
    if r.status_code != 200:
        return dcbs

    # Decode using the encoding detected from the body.
    r.encoding = r.apparent_encoding

    # Parse the document and walk its table rows.
    bs = BeautifulSoup(markup=r.text, features="html.parser")
    for tr in bs.find_all(name="tr"):
        td_list = tr.find_all(name="td", attrs={"align": "center"})
        if len(td_list) <= 1:
            continue

        # Serialize the row once; every pattern below runs against it.
        row_html = str(tr)

        sales_text = _first_group(
            '<td><strong>(.*?)</strong></td>', row_html
        ).strip().replace(",", "")
        sales = float(sales_text) if sales_text else 0

        first_num_text = _first_group(
            '<td align="left" style="color:#999;"><strong>(.*?)</strong>',
            row_html,
        ).strip()
        first_num = int(first_num_text) if first_num_text else 0

        second_num_text = _first_group(
            '<td align="center"><strong class="rc">(.*?)</strong></td>',
            row_html,
        ).strip()
        second_num = int(second_num_text) if second_num_text else 0

        dcb = Double_Color_Ball(
            issue_no=td_list[1].get_text(),
            aware_date=td_list[0].get_text(),
            red_ball=" ".join(re.findall('<em class="rr">(.*?)</em>', row_html)),
            blue_ball=_first_group('<em>(.*?)</em>', row_html),
            sales=sales,
            first_number=first_num,
            second_number=second_num,
            create_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        )
        print(dcb.__dict__)
        dcbs.append(dcb)

    return dcbs

if __name__ == "__main__":
    # Manual run: parse the first listing page on its own, then crawl the
    # first five pages and print the combined result.
    first_page_url = "http://kaijiang.zhcw.com/zhcw/html/ssq/list.html"
    get_dcb_page_data(first_page_url)
    print(get_all_data(5))
